import re
from httputils import ReqUtil
import json


# 抓取内涵段子网站页面
# Fetch the front page of the Neihan Duanzi (joke) site.
def getNeihan():
    """Download the neihanshequ.com front page and return the raw response body."""
    # Browser-like request headers: the site rejects obviously scripted clients,
    # so a real Chrome user-agent, referer and session cookie are supplied.
    target = "http://neihanshequ.com/"
    origin_hdr = "http://neihanshequ.com/"
    ua = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36"
    ref = "http://neihanshequ.com/bar/1/"
    session_cookie = 'uuid="w:ac534271819845f3a9444c2e308f04b6"; tt_webid=62647603619; csrftoken=eaacdef07ccc43397e4735837301663e; _ga=GA1.2.1114156404.1504764406; _gid=GA1.2.992552767.1504764406; Hm_lvt_773f1a5aa45c642cf87eef671e4d3f6a=1504764406; Hm_lpvt_773f1a5aa45c642cf87eef671e4d3f6a=1504764967'

    # Delegate the actual HTTP GET to the project helper and hand back its result.
    client = ReqUtil()
    return client.send_get(url=target, origin_url=origin_hdr, user_agent=ua, referer=ref, cookie=session_cookie, login_data=[])

# 抓取段子评论页面
# Fetch the comment page / API response for a single joke.
def getComment(url):
    """GET *url* (a comment-API endpoint on neihanshequ.com) and return the response body.

    The headers include a logged-in session cookie (sessionid/sid_tt), which the
    comment API requires; the front-page cookie in getNeihan does not carry it.
    """
    # NOTE: the original had a no-op `url = url` self-assignment here — removed.
    origin = "http://neihanshequ.com/"
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36"
    referer = "http://neihanshequ.com/"
    cookie = 'uuid="w:ac534271819845f3a9444c2e308f04b6"; tt_webid=62647603619; login_flag=b09946dddd7088eebb5df9be54de330c; sessionid=f9af44e7e8919563d7187ab4f12d3ef0; uid_tt=3ea64d55d2f5e5f112aa7f7401de4351; sid_tt=f9af44e7e8919563d7187ab4f12d3ef0; sid_guard="f9af44e7e8919563d7187ab4f12d3ef0|1504773824|2592000|Sat\054 07-Oct-2017 08:43:44 GMT"; csrftoken=eaacdef07ccc43397e4735837301663e; Hm_lvt_773f1a5aa45c642cf87eef671e4d3f6a=1504764406; Hm_lpvt_773f1a5aa45c642cf87eef671e4d3f6a=1504773836; _ga=GA1.2.1114156404.1504764406; _gid=GA1.2.992552767.1504764406; _gat=1'
    req = ReqUtil()
    data = req.send_get(url=url, origin_url=origin, user_agent=user_agent, referer=referer, cookie=cookie, login_data=[])
    return data

# Top-level scraping script: download the front page, select popular jokes
# (more than 5000 likes) and print each one with up to five hot comments.
pageData = getNeihan()

# Each joke sits in a content-wrapper div; capture both the joke body and the
# digg (like) section so the text and the like count can be extracted from it.
pa = re.compile(r'<div class="content-wrapper">(.*?)<li class="digg-wrapper ">(.*?)</ul>', re.S)
res = pa.findall(pageData)
for v in res:
    fragment = str(v)
    # Like count lives in a <span class="digg"> element.
    goodsum = re.findall(r'<span class="digg">(.*?)</span>', fragment, re.S)
    if not goodsum:
        # Fragment did not match the expected markup — skip instead of IndexError.
        continue
    if int(goodsum[0]) > 5000:
        # Joke text is the first <p> paragraph of the fragment.
        content = re.findall(r'<p>(.*?)</p>', fragment, re.S)
        if not content:
            continue
        print('点赞数: %s %s' % (goodsum[0], content[0]))
        # The data-group-id attribute identifies the joke for the comment API.
        gidList = re.findall(r'data-group-id="(.*?)"', fragment, re.S)
        if not gidList:
            continue
        gid = gidList[0]
        # Fetch the comments for this joke (JSON string).
        commentS = getComment("http://neihanshequ.com/m/api/get_essay_comments/?group_id="+gid+"&app_name=neihanshequ_web")
        commentJ = json.loads(commentS)
        print("热门评论：")
        # Show at most five of the top comments; slicing replaces the original
        # counter (which shadowed the builtin `sum`) and its no-op `continue`
        # branch that kept iterating after the limit was reached.
        for x in commentJ.get("data").get("top_comments")[:5]:
            print("  %s  ----%s" % (x.get("text"), x.get("user_name")))
        print()



